/*==============================================================================
FILE NAME: Process_Emissions_Events.do
INPUTS: Emissions_events.dta
OUTPUTS: Emissions_events_clean.dta
==============================================================================*/

/* Standalone setup: when this do-file is run on its own (not from a master
   script), fill in your username and root path below so the data path global
   is defined. With the username placeholder left empty, the condition is
   false for any non-empty username and the block is skipped. */
if "`c(username)'" == "" { // insert username
    global rootdir "" // insert root path
    // Data folder of the replication package, relative to the root path
    global processed_data "${rootdir}/processed_data"
}

// Turn off the --more-- pager so the script runs without pausing for input
set more off

// Read the raw emissions events file, discarding whatever data is in memory
use "${processed_data}/Emissions_events.dta", clear

// Remove columns that are not used in the rest of this script
drop TCEQDocketNo NoticeType PhysicalAddress PhysicalAddress2 State ///
     PhysicalLocation N P InvestigationStatusDate EnforcementCaseNo ///
     ActivityCode

// Give the identifier columns short names, and give column O (the alternate
// zip code column) a descriptive name. Old and new names pair up by position.
rename (RegulatedEntityNo CustomerNo InvestigationNo ViolationTrackNo IncidentNo O) ///
       (RN                CN         IN              VN               EEN        ZipCode_alt)

// Turn both zip code columns into numeric variables
destring ZipCode ZipCode_alt, replace

// Rows where the primary zip is missing but the alternate one is available
local use_alt_zip "ZipCode == . & ZipCode_alt != ."

// Mark those rows (flag is 1 there, missing elsewhere), then copy the
// alternate zip into the primary column
gen ZipFlag_EE = 1 if `use_alt_zip'
replace ZipCode = ZipCode_alt if `use_alt_zip'
drop ZipCode_alt ZipCodeExt

// Blank out any zip code outside the 73301-88589 bounds.
// NOTE(review): this is only a lower/upper bound check; it does not verify
// that each value inside the bounds is a zip code actually assigned to Texas.
replace ZipCode = . if !inrange(ZipCode, 73301, 88589)

// Where City is blank and NearestCity is not, use NearestCity as the city
replace City = NearestCity if missing(City) & !missing(NearestCity)

// NearestCity is no longer needed once City has been filled
drop NearestCity

// Tag the location columns with an _EE suffix to mark them as coming from
// the emissions events data
rename (TCEQRegion    City    ZipCode    County)    ///
       (TCEQRegion_EE City_EE ZipCode_EE County_EE)

// Constant indicator: every row in this file is an emissions event
generate EmissionsEvent = 1

// Use EE_ names for the incident type and its start/end dates
rename (IncidentType IncidentStartDate IncidentEndDate) ///
       (EE_Type      EE_Start_Date     EE_End_Date)

// Parse each month-day-year text column into a numeric Stata daily date.
// The parsed values are built in a scratch variable that then takes over the
// original column's name, so each converted column ends up last in the
// variable order.
local date_cols InvestigationStartDate EE_Start_Date EE_End_Date
foreach col of local date_cols {
    gen parsed_date = date(`col', "MDY")
    format parsed_date %td
    drop `col'
    rename parsed_date `col'
}

// A negative daily date is a day before 01jan1960; treat such event dates
// as invalid and set them to missing
foreach col in EE_Start_Date EE_End_Date {
    replace `col' = . if `col' < 0
}

// Split the event start date into calendar components.
// Rows with a missing start date get missing day, month, and year.
gen day = day(EE_Start_Date)
gen month = month(EE_Start_Date)
gen year = year(EE_Start_Date)

// Drop events that started after 2020.
// Stata orders missing values above every number, so a bare "year > 2020"
// would also delete every event whose start date is missing (unparseable or
// blanked out as negative). The !missing() guard limits the drop to events
// that are actually dated after 2020.
// NOTE(review): events with a missing start date are therefore kept in the
// output; add an explicit drop on missing(EE_Start_Date) if they should be
// excluded, so that the exclusion is deliberate and visible.
drop if year > 2020 & !missing(year)

// Write the cleaned emissions events data, overwriting any existing file
save "$processed_data/Emissions_events_clean.dta", replace